from pathlib import Path
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
= Path.cwd().parent.parent PROJECT_ROOT
XGBoost
= pd.read_csv(f"{PROJECT_ROOT}/data/rossmann-store-sales/train.csv", low_memory=False)
train_df = pd.read_csv(f"{PROJECT_ROOT}/data/rossmann-store-sales/test.csv", low_memory=False)
test_df = pd.read_csv(f"{PROJECT_ROOT}/data/rossmann-store-sales/store.csv", low_memory=False)
store_df = pd.read_csv(f"{PROJECT_ROOT}/data/rossmann-store-sales/sample_submission.csv", low_memory=False) sample_submission_df
train_df
Store | DayOfWeek | Date | Sales | Customers | Open | Promo | StateHoliday | SchoolHoliday | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5 | 2015-07-31 | 5263 | 555 | 1 | 1 | 0 | 1 |
1 | 2 | 5 | 2015-07-31 | 6064 | 625 | 1 | 1 | 0 | 1 |
2 | 3 | 5 | 2015-07-31 | 8314 | 821 | 1 | 1 | 0 | 1 |
3 | 4 | 5 | 2015-07-31 | 13995 | 1498 | 1 | 1 | 0 | 1 |
4 | 5 | 5 | 2015-07-31 | 4822 | 559 | 1 | 1 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1017204 | 1111 | 2 | 2013-01-01 | 0 | 0 | 0 | 0 | a | 1 |
1017205 | 1112 | 2 | 2013-01-01 | 0 | 0 | 0 | 0 | a | 1 |
1017206 | 1113 | 2 | 2013-01-01 | 0 | 0 | 0 | 0 | a | 1 |
1017207 | 1114 | 2 | 2013-01-01 | 0 | 0 | 0 | 0 | a | 1 |
1017208 | 1115 | 2 | 2013-01-01 | 0 | 0 | 0 | 0 | a | 1 |
1017209 rows × 9 columns
test_df
Id | Store | DayOfWeek | Date | Open | Promo | StateHoliday | SchoolHoliday | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 4 | 2015-09-17 | 1.0 | 1 | 0 | 0 |
1 | 2 | 3 | 4 | 2015-09-17 | 1.0 | 1 | 0 | 0 |
2 | 3 | 7 | 4 | 2015-09-17 | 1.0 | 1 | 0 | 0 |
3 | 4 | 8 | 4 | 2015-09-17 | 1.0 | 1 | 0 | 0 |
4 | 5 | 9 | 4 | 2015-09-17 | 1.0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
41083 | 41084 | 1111 | 6 | 2015-08-01 | 1.0 | 0 | 0 | 0 |
41084 | 41085 | 1112 | 6 | 2015-08-01 | 1.0 | 0 | 0 | 0 |
41085 | 41086 | 1113 | 6 | 2015-08-01 | 1.0 | 0 | 0 | 0 |
41086 | 41087 | 1114 | 6 | 2015-08-01 | 1.0 | 0 | 0 | 0 |
41087 | 41088 | 1115 | 6 | 2015-08-01 | 1.0 | 0 | 0 | 1 |
41088 rows × 8 columns
store_df
Store | StoreType | Assortment | CompetitionDistance | CompetitionOpenSinceMonth | CompetitionOpenSinceYear | Promo2 | Promo2SinceWeek | Promo2SinceYear | PromoInterval | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | c | a | 1270.0 | 9.0 | 2008.0 | 0 | NaN | NaN | NaN |
1 | 2 | a | a | 570.0 | 11.0 | 2007.0 | 1 | 13.0 | 2010.0 | Jan,Apr,Jul,Oct |
2 | 3 | a | a | 14130.0 | 12.0 | 2006.0 | 1 | 14.0 | 2011.0 | Jan,Apr,Jul,Oct |
3 | 4 | c | c | 620.0 | 9.0 | 2009.0 | 0 | NaN | NaN | NaN |
4 | 5 | a | a | 29910.0 | 4.0 | 2015.0 | 0 | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1110 | 1111 | a | a | 1900.0 | 6.0 | 2014.0 | 1 | 31.0 | 2013.0 | Jan,Apr,Jul,Oct |
1111 | 1112 | c | c | 1880.0 | 4.0 | 2006.0 | 0 | NaN | NaN | NaN |
1112 | 1113 | a | c | 9260.0 | NaN | NaN | 0 | NaN | NaN | NaN |
1113 | 1114 | a | c | 870.0 | NaN | NaN | 0 | NaN | NaN | NaN |
1114 | 1115 | d | c | 5350.0 | NaN | NaN | 1 | 22.0 | 2012.0 | Mar,Jun,Sept,Dec |
1115 rows × 10 columns
sample_submission_df
Id | Sales | |
---|---|---|
0 | 1 | 0 |
1 | 2 | 0 |
2 | 3 | 0 |
3 | 4 | 0 |
4 | 5 | 0 |
... | ... | ... |
41083 | 41084 | 0 |
41084 | 41085 | 0 |
41085 | 41086 | 0 |
41086 | 41087 | 0 |
41087 | 41088 | 0 |
41088 rows × 2 columns
= pd.merge(train_df, store_df, on="Store", how="left")
train_df = pd.merge(test_df, store_df, on="Store", how="left") test_df
def split_date(df):
"Date"] = pd.to_datetime(df["Date"])
df["Year"] = df.Date.dt.year
df["Month"] = df.Date.dt.month
df["Day"] = df.Date.dt.day
df["WeekOfYear"] = df.Date.dt.isocalendar().week
df[return df
= split_date(train_df)
train_df = split_date(test_df) test_df
train_df
Store | DayOfWeek | Date | Sales | Customers | Open | Promo | StateHoliday | SchoolHoliday | StoreType | ... | CompetitionOpenSinceMonth | CompetitionOpenSinceYear | Promo2 | Promo2SinceWeek | Promo2SinceYear | PromoInterval | Year | Month | Day | WeekOfYear | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5 | 2015-07-31 | 5263 | 555 | 1 | 1 | 0 | 1 | c | ... | 9.0 | 2008.0 | 0 | NaN | NaN | NaN | 2015 | 7 | 31 | 31 |
1 | 2 | 5 | 2015-07-31 | 6064 | 625 | 1 | 1 | 0 | 1 | a | ... | 11.0 | 2007.0 | 1 | 13.0 | 2010.0 | Jan,Apr,Jul,Oct | 2015 | 7 | 31 | 31 |
2 | 3 | 5 | 2015-07-31 | 8314 | 821 | 1 | 1 | 0 | 1 | a | ... | 12.0 | 2006.0 | 1 | 14.0 | 2011.0 | Jan,Apr,Jul,Oct | 2015 | 7 | 31 | 31 |
3 | 4 | 5 | 2015-07-31 | 13995 | 1498 | 1 | 1 | 0 | 1 | c | ... | 9.0 | 2009.0 | 0 | NaN | NaN | NaN | 2015 | 7 | 31 | 31 |
4 | 5 | 5 | 2015-07-31 | 4822 | 559 | 1 | 1 | 0 | 1 | a | ... | 4.0 | 2015.0 | 0 | NaN | NaN | NaN | 2015 | 7 | 31 | 31 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1017204 | 1111 | 2 | 2013-01-01 | 0 | 0 | 0 | 0 | a | 1 | a | ... | 6.0 | 2014.0 | 1 | 31.0 | 2013.0 | Jan,Apr,Jul,Oct | 2013 | 1 | 1 | 1 |
1017205 | 1112 | 2 | 2013-01-01 | 0 | 0 | 0 | 0 | a | 1 | c | ... | 4.0 | 2006.0 | 0 | NaN | NaN | NaN | 2013 | 1 | 1 | 1 |
1017206 | 1113 | 2 | 2013-01-01 | 0 | 0 | 0 | 0 | a | 1 | a | ... | NaN | NaN | 0 | NaN | NaN | NaN | 2013 | 1 | 1 | 1 |
1017207 | 1114 | 2 | 2013-01-01 | 0 | 0 | 0 | 0 | a | 1 | a | ... | NaN | NaN | 0 | NaN | NaN | NaN | 2013 | 1 | 1 | 1 |
1017208 | 1115 | 2 | 2013-01-01 | 0 | 0 | 0 | 0 | a | 1 | d | ... | NaN | NaN | 1 | 22.0 | 2012.0 | Mar,Jun,Sept,Dec | 2013 | 1 | 1 | 1 |
1017209 rows × 22 columns
= train_df.query("Open == 1").copy() train_df
train_df
Store | DayOfWeek | Date | Sales | Customers | Open | Promo | StateHoliday | SchoolHoliday | StoreType | ... | CompetitionOpenSinceMonth | CompetitionOpenSinceYear | Promo2 | Promo2SinceWeek | Promo2SinceYear | PromoInterval | Year | Month | Day | WeekOfYear | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5 | 2015-07-31 | 5263 | 555 | 1 | 1 | 0 | 1 | c | ... | 9.0 | 2008.0 | 0 | NaN | NaN | NaN | 2015 | 7 | 31 | 31 |
1 | 2 | 5 | 2015-07-31 | 6064 | 625 | 1 | 1 | 0 | 1 | a | ... | 11.0 | 2007.0 | 1 | 13.0 | 2010.0 | Jan,Apr,Jul,Oct | 2015 | 7 | 31 | 31 |
2 | 3 | 5 | 2015-07-31 | 8314 | 821 | 1 | 1 | 0 | 1 | a | ... | 12.0 | 2006.0 | 1 | 14.0 | 2011.0 | Jan,Apr,Jul,Oct | 2015 | 7 | 31 | 31 |
3 | 4 | 5 | 2015-07-31 | 13995 | 1498 | 1 | 1 | 0 | 1 | c | ... | 9.0 | 2009.0 | 0 | NaN | NaN | NaN | 2015 | 7 | 31 | 31 |
4 | 5 | 5 | 2015-07-31 | 4822 | 559 | 1 | 1 | 0 | 1 | a | ... | 4.0 | 2015.0 | 0 | NaN | NaN | NaN | 2015 | 7 | 31 | 31 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1016776 | 682 | 2 | 2013-01-01 | 3375 | 566 | 1 | 0 | a | 1 | b | ... | 9.0 | 2006.0 | 0 | NaN | NaN | NaN | 2013 | 1 | 1 | 1 |
1016827 | 733 | 2 | 2013-01-01 | 10765 | 2377 | 1 | 0 | a | 1 | b | ... | 10.0 | 1999.0 | 0 | NaN | NaN | NaN | 2013 | 1 | 1 | 1 |
1016863 | 769 | 2 | 2013-01-01 | 5035 | 1248 | 1 | 0 | a | 1 | b | ... | NaN | NaN | 1 | 48.0 | 2012.0 | Jan,Apr,Jul,Oct | 2013 | 1 | 1 | 1 |
1017042 | 948 | 2 | 2013-01-01 | 4491 | 1039 | 1 | 0 | a | 1 | b | ... | NaN | NaN | 0 | NaN | NaN | NaN | 2013 | 1 | 1 | 1 |
1017190 | 1097 | 2 | 2013-01-01 | 5961 | 1405 | 1 | 0 | a | 1 | b | ... | 3.0 | 2002.0 | 0 | NaN | NaN | NaN | 2013 | 1 | 1 | 1 |
844392 rows × 22 columns
def comp_months(df):
"CompetitionOpen"] = 12 * (df["Year"] - df["CompetitionOpenSinceYear"]) + (
df["Month"] - df["CompetitionOpenSinceMonth"]
df[
)"CompetitionOpen"] = df["CompetitionOpen"].map(lambda x: 0 if x < 0 else x).fillna(0)
df[return df
= comp_months(train_df)
train_df = comp_months(test_df) test_df
def check_promo_month(row):
= {
month2str 1: "Jan",
2: "Feb",
3: "Mar",
4: "Apr",
5: "May",
6: "Jun",
7: "Jul",
8: "Aug",
9: "Sept",
10: "Oct",
11: "Nov",
12: "Dec",
}try:
# given input row the promointerval column is indexed and split by ","
= (row["PromoInterval"] or "").split(",")
months # if the row indexed promo2open column and the row month are in months you return 1
if row["Promo2Open"] and month2str[row["Month"]] in months:
return 1
else:
return 0
except Exception:
return 0
def promo_cols(df):
# Months since Promo2 was open
"Promo2Open"] = 12 * (df.Year - df.Promo2SinceYear) + (df.WeekOfYear - df.Promo2SinceWeek) * 7 / 30.5
df["Promo2Open"] = df["Promo2Open"].fillna(0).map(lambda x: 0 if x < 0 else x) * df["Promo2"]
df[# Whether a new round of promotions was started in the current month
"IsPromo2Month"] = df.apply(check_promo_month, axis=1) * df["Promo2"]
df[return df
= promo_cols(train_df)
train_df = promo_cols(test_df) test_df
train_df
Store | DayOfWeek | Date | Sales | Customers | Open | Promo | StateHoliday | SchoolHoliday | StoreType | ... | Promo2SinceWeek | Promo2SinceYear | PromoInterval | Year | Month | Day | WeekOfYear | CompetitionOpen | Promo2Open | IsPromo2Month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5 | 2015-07-31 | 5263 | 555 | 1 | 1 | 0 | 1 | c | ... | NaN | NaN | NaN | 2015 | 7 | 31 | 31 | 82.0 | 0.000000 | 0 |
1 | 2 | 5 | 2015-07-31 | 6064 | 625 | 1 | 1 | 0 | 1 | a | ... | 13.0 | 2010.0 | Jan,Apr,Jul,Oct | 2015 | 7 | 31 | 31 | 92.0 | 64.131148 | 1 |
2 | 3 | 5 | 2015-07-31 | 8314 | 821 | 1 | 1 | 0 | 1 | a | ... | 14.0 | 2011.0 | Jan,Apr,Jul,Oct | 2015 | 7 | 31 | 31 | 103.0 | 51.901639 | 1 |
3 | 4 | 5 | 2015-07-31 | 13995 | 1498 | 1 | 1 | 0 | 1 | c | ... | NaN | NaN | NaN | 2015 | 7 | 31 | 31 | 70.0 | 0.000000 | 0 |
4 | 5 | 5 | 2015-07-31 | 4822 | 559 | 1 | 1 | 0 | 1 | a | ... | NaN | NaN | NaN | 2015 | 7 | 31 | 31 | 3.0 | 0.000000 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1016776 | 682 | 2 | 2013-01-01 | 3375 | 566 | 1 | 0 | a | 1 | b | ... | NaN | NaN | NaN | 2013 | 1 | 1 | 1 | 76.0 | 0.000000 | 0 |
1016827 | 733 | 2 | 2013-01-01 | 10765 | 2377 | 1 | 0 | a | 1 | b | ... | NaN | NaN | NaN | 2013 | 1 | 1 | 1 | 159.0 | 0.000000 | 0 |
1016863 | 769 | 2 | 2013-01-01 | 5035 | 1248 | 1 | 0 | a | 1 | b | ... | 48.0 | 2012.0 | Jan,Apr,Jul,Oct | 2013 | 1 | 1 | 1 | 0.0 | 1.213115 | 1 |
1017042 | 948 | 2 | 2013-01-01 | 4491 | 1039 | 1 | 0 | a | 1 | b | ... | NaN | NaN | NaN | 2013 | 1 | 1 | 1 | 0.0 | 0.000000 | 0 |
1017190 | 1097 | 2 | 2013-01-01 | 5961 | 1405 | 1 | 0 | a | 1 | b | ... | NaN | NaN | NaN | 2013 | 1 | 1 | 1 | 130.0 | 0.000000 | 0 |
844392 rows × 25 columns
= [
input_cols "Store",
"DayOfWeek",
"Promo",
"StateHoliday",
"SchoolHoliday",
"StoreType",
"Assortment",
"CompetitionDistance",
"CompetitionOpen",
"Day",
"Month",
"Year",
"WeekOfYear",
"Promo2",
"Promo2Open",
"IsPromo2Month",
]= "Sales" target_col
= train_df[input_cols].copy()
inputs = train_df[target_col].copy() targets
= test_df[input_cols].copy() test_inputs
= [
numeric_cols "Store",
"Promo",
"SchoolHoliday",
"CompetitionDistance",
"CompetitionOpen",
"Promo2",
"Promo2Open",
"IsPromo2Month",
"Day",
"Month",
"Year",
"WeekOfYear",
]= ["DayOfWeek", "StateHoliday", "StoreType", "Assortment"] categorical_cols
sum() inputs[numeric_cols].isna().
Store 0
Promo 0
SchoolHoliday 0
CompetitionDistance 2186
CompetitionOpen 0
Promo2 0
Promo2Open 0
IsPromo2Month 0
Day 0
Month 0
Year 0
WeekOfYear 0
dtype: int64
sum() test_inputs[numeric_cols].isna().
Store 0
Promo 0
SchoolHoliday 0
CompetitionDistance 96
CompetitionOpen 0
Promo2 0
Promo2Open 0
IsPromo2Month 0
Day 0
Month 0
Year 0
WeekOfYear 0
dtype: int64
= inputs["CompetitionDistance"].max() max_distance
"CompetitionDistance"] = inputs["CompetitionDistance"].fillna(max_distance)
inputs["CompetitionDistance"] = test_inputs["CompetitionDistance"].fillna(max_distance) test_inputs[
= MinMaxScaler().fit(inputs[numeric_cols]) scaler
= scaler.transform(inputs[numeric_cols])
inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols]) test_inputs[numeric_cols]
= OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(inputs[categorical_cols])
encoder = list(encoder.get_feature_names_out(categorical_cols)) encoded_cols
= encoder.transform(inputs[categorical_cols])
inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols]) test_inputs[encoded_cols]
= inputs[numeric_cols + encoded_cols]
X = test_inputs[numeric_cols + encoded_cols] X_test